{
 "cells": [
  {
   "cell_type": "markdown",
   "id": "228bfca6-72ee-4656-b7c4-7694ab085518",
   "metadata": {
    "papermill": {
     "duration": 0.022209,
     "end_time": "2021-10-20T19:08:32.378722",
     "exception": false,
     "start_time": "2021-10-20T19:08:32.356513",
     "status": "completed"
    },
    "tags": []
   },
   "source": [
    "# nlp-transform-snippets"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "excess-grass",
   "metadata": {
    "papermill": {
     "duration": 0.00735,
     "end_time": "2021-10-20T19:08:32.395594",
     "exception": false,
     "start_time": "2021-10-20T19:08:32.388244",
     "status": "completed"
    },
    "tags": []
   },
   "source": [
    "creates snippets out of large text files"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "assigned-lottery",
   "metadata": {
    "papermill": {
     "duration": 2.08022,
     "end_time": "2021-10-20T19:08:34.482980",
     "exception": false,
     "start_time": "2021-10-20T19:08:32.402760",
     "status": "completed"
    },
    "tags": []
   },
   "outputs": [],
   "source": [
    "!pip3 install wget==3.2"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "impaired-sharing",
   "metadata": {
    "papermill": {
     "duration": 0.149253,
     "end_time": "2021-10-20T19:08:34.646994",
     "exception": false,
     "start_time": "2021-10-20T19:08:34.497741",
     "status": "completed"
    },
    "tags": []
   },
   "outputs": [],
   "source": [
    "import wget\n",
    "import logging\n",
    "import numpy as np\n",
    "import os\n",
    "import re\n",
    "import shutil\n",
    "import sys\n",
    "import tarfile\n",
    "import time"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "local-gather",
   "metadata": {
    "papermill": {
     "duration": 0.040563,
     "end_time": "2021-10-20T19:08:34.706262",
     "exception": false,
     "start_time": "2021-10-20T19:08:34.665699",
     "status": "completed"
    },
    "tags": []
   },
   "outputs": [],
   "source": [
    "# file name for training data zip\n",
    "input_filename = os.environ.get('input_filename ', 'data.zip')\n",
    "\n",
    "# resulting model zip file name\n",
    "output_model_zip = os.environ.get('output_model_zip', 'model.zip')\n",
    "\n",
    "# temporal data storage for local execution\n",
    "data_dir = os.environ.get('data_dir', '../../data/')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "a1cd43db-f550-4bc9-8353-e73bbf1e39ad",
   "metadata": {
    "papermill": {
     "duration": 0.030396,
     "end_time": "2021-10-20T19:08:34.760590",
     "exception": false,
     "start_time": "2021-10-20T19:08:34.730194",
     "status": "completed"
    },
    "tags": []
   },
   "outputs": [],
   "source": [
    "parameters = list(\n",
    "  map(\n",
    "      lambda s: re.sub('$', '\"', s),\n",
    "      map(\n",
    "          lambda s: s.replace('=', '=\"'),\n",
    "          filter(\n",
    "              lambda s: s.find('=') > -1 and bool(re.match('[A-Za-z0-9_]*=[.\\/A-Za-z0-9]*', s)),\n",
    "              sys.argv\n",
    "          )\n",
    "      )\n",
    "  )\n",
    ")\n",
    "\n",
    "for parameter in parameters:\n",
    "    logging.warning('Parameter: '+parameter) \n",
    "    exec(parameter)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "f366a1f7-3b7b-4366-b39d-8361d7b90e0d",
   "metadata": {
    "papermill": {
     "duration": 0.075319,
     "end_time": "2021-10-20T19:08:34.845222",
     "exception": true,
     "start_time": "2021-10-20T19:08:34.769903",
     "status": "failed"
    },
    "tags": []
   },
   "outputs": [],
   "source": [
    "source_folder=str(time.time())\n",
    "shutil.unpack_archive(data_dir + input_filename, extract_dir=data_dir + source_folder)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "49bac2b7-2414-4079-9178-b0ec4f16009e",
   "metadata": {
    "papermill": {
     "duration": null,
     "end_time": null,
     "exception": null,
     "start_time": null,
     "status": "pending"
    },
    "tags": []
   },
   "outputs": [],
   "source": [
    "# TODO generalize\n",
    "letter = 'abcdefghijklmnopqrstuvwxyz'\n",
    "digits = '0123456789'\n",
    "others = '!\"#$%&\\'()*+,-./:;<=>?@[\\\\]^_`{|}~'\n",
    "alphabet = letter + digits + others\n",
    "print('alphabet size:', len(alphabet))\n",
    "\n",
    "# all-zeroes padding vector:\n",
    "pad_vector = [0 for x in alphabet]\n",
    "\n",
    "# pre-calculated one-hot vectors:\n",
    "supported_chars_map = {}\n",
    "\n",
    "for i, ch in enumerate(alphabet):\n",
    "  vec = [0 for x in alphabet]\n",
    "  vec[i] = 1\n",
    "  supported_chars_map[ch] = vec\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "1820b45c-9bd6-4e4f-bae6-8a9abf1402e7",
   "metadata": {
    "papermill": {
     "duration": null,
     "end_time": null,
     "exception": null,
     "start_time": null,
     "status": "pending"
    },
    "tags": []
   },
   "outputs": [],
   "source": [
    "def get_source_snippets(file_name, breakup=True):\n",
    "    # Read the file content and lower-case:                                    \n",
    "    text = \"\"\n",
    "    with open(file_name, mode='r') as file:\n",
    "        text = file.read().lower()\n",
    "    lines = text.split('\\n')\n",
    "    nlines = len(lines)\n",
    "    if breakup and nlines > 50:\n",
    "        aThird = nlines//3\n",
    "        twoThirds = 2*aThird\n",
    "        text1 = '\\n'.join(lines[:aThird])\n",
    "        text2 = '\\n'.join(lines[aThird:twoThirds])\n",
    "        text3 = '\\n'.join(lines[twoThirds:])\n",
    "        return [text1, text2, text3]\n",
    "    return [text]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "2b285107-7227-458f-82ac-14a42a720728",
   "metadata": {
    "papermill": {
     "duration": null,
     "end_time": null,
     "exception": null,
     "start_time": null,
     "status": "pending"
    },
    "tags": []
   },
   "outputs": [],
   "source": [
    "def turn_sample_to_vector(sample, sample_vectors_size=1024,\n",
    "                          normalize_whitespace=True):\n",
    "    if normalize_whitespace:\n",
    "        # Map (most) white-space to space and compact to single one:\n",
    "        sample = sample.replace('\\n', ' ').replace('\\r', ' ').replace('\\t', ' ')\n",
    "        sample = re.sub('\\s+', ' ', sample)\n",
    "\n",
    "    # Encode the characters to one-hot vectors:\n",
    "    sample_vectors = []\n",
    "    for ch in sample:\n",
    "        if ch in supported_chars_map:\n",
    "            sample_vectors.append(supported_chars_map[ch])\n",
    "\n",
    "    # Truncate to fixed length:\n",
    "    sample_vectors = sample_vectors[0:sample_vectors_size]\n",
    "\n",
    "    # Pad with 0 vectors:\n",
    "    if len(sample_vectors) < sample_vectors_size:\n",
    "        for i in range(0, sample_vectors_size - len(sample_vectors)):\n",
    "            sample_vectors.append(pad_vector)\n",
    "\n",
    "    return np.array(sample_vectors)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "0f133310-24f6-43f0-aae8-e9cfd9058805",
   "metadata": {
    "papermill": {
     "duration": null,
     "end_time": null,
     "exception": null,
     "start_time": null,
     "status": "pending"
    },
    "tags": []
   },
   "outputs": [],
   "source": [
    "def turn_file_to_vectors(file_name, sample_vectors_size=1024, normalize_whitespace=True, breakup=True):\n",
    "    samples = get_source_snippets(file_name, breakup)\n",
    "    return [turn_sample_to_vector(s, sample_vectors_size, normalize_whitespace) for s in samples]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "18ff68a6-b9bd-485f-b41c-43a13049bf16",
   "metadata": {
    "papermill": {
     "duration": null,
     "end_time": null,
     "exception": null,
     "start_time": null,
     "status": "pending"
    },
    "tags": []
   },
   "outputs": [],
   "source": [
    "def get_input_and_labels(root_folder, sample_vectors_size=1024, breakup=True):\n",
    "    X = []\n",
    "    Y = []\n",
    "    for i, lang in enumerate(langs):\n",
    "        print('Processing language:', lang)\n",
    "        # One-hot class label vector:\n",
    "        class_label = [0 for x in range(0, num_classes)]\n",
    "        class_label[i] = 1\n",
    "        # For all files in language folder:\n",
    "        folder = os.path.join(root_folder, lang)\n",
    "        for fn in os.listdir(folder):\n",
    "            if fn.startswith(\".\"):\n",
    "                continue  # Skip hidden files and Jupyterlab cache directories\n",
    "            file_name = os.path.join(folder, fn)\n",
    "            sample_vectors = turn_file_to_vectors(file_name,\n",
    "                                                sample_vectors_size=sample_vectors_size,\n",
    "                                                breakup=breakup)\n",
    "            for fv in sample_vectors:\n",
    "                X.append(fv)                 # the sample feature vector\n",
    "                Y.append(class_label)        # the class ground-truth\n",
    "\n",
    "    return np.array(X, dtype=np.int8), np.array(Y, dtype=np.int8)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "2711390a-0551-4a3c-8a01-641ba515a3db",
   "metadata": {
    "papermill": {
     "duration": null,
     "end_time": null,
     "exception": null,
     "start_time": null,
     "status": "pending"
    },
    "tags": []
   },
   "outputs": [],
   "source": [
    "# TODO generalize\n",
    "langs = [\n",
    "  \"C\",\n",
    "  \"C#\",\n",
    "  \"C++\",\n",
    "  \"D\",\n",
    "  \"Haskell\",\n",
    "  \"Java\",\n",
    "  \"JavaScript\",\n",
    "  \"PHP\",\n",
    "  \"Python\",\n",
    "  \"Rust\"\n",
    "]\n",
    "\n",
    "num_classes = len(langs)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "878f9504-41e7-493f-a9d1-a50e8c8c6170",
   "metadata": {
    "papermill": {
     "duration": null,
     "end_time": null,
     "exception": null,
     "start_time": null,
     "status": "pending"
    },
    "tags": []
   },
   "outputs": [],
   "source": [
    "x, y = get_input_and_labels(root_folder=data_dir + source_folder + '/train') #TODO use data folder\n",
    "\n",
    "# Shuffle data\n",
    "shuffle_indices = np.random.permutation(np.arange(len(y)))\n",
    "x_shuffled = x[shuffle_indices]\n",
    "y_shuffled = y[shuffle_indices]\n",
    "\n",
    "print('samples shape', x_shuffled.shape)\n",
    "print('class labels shape:', y_shuffled.shape)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "e82ce8bf-facf-4208-a287-376c27937284",
   "metadata": {
    "papermill": {
     "duration": null,
     "end_time": null,
     "exception": null,
     "start_time": null,
     "status": "pending"
    },
    "tags": []
   },
   "outputs": [],
   "source": [
    "from tensorflow.keras.models import Sequential, Model\n",
    "from tensorflow.keras.layers import Activation, Dense, Dropout, Flatten, Input\n",
    "from tensorflow.keras.layers import Conv1D, MaxPooling1D, Concatenate\n",
    "\n",
    "# Model Hyperparameters\n",
    "kernel_sizes = (3, 9, 19)\n",
    "pooling_sizes = (3, 9, 19)\n",
    "num_filters = 128\n",
    "dropout_prob = 0.5\n",
    "hidden_dims = 128\n",
    "\n",
    "stage_in = Input(shape=(1024, 68))\n",
    "convs = []\n",
    "for i in range(0, len(kernel_sizes)):\n",
    "  conv = Conv1D(filters=num_filters,\n",
    "                kernel_size=kernel_sizes[i],\n",
    "                padding='valid',\n",
    "                activation='relu',\n",
    "                strides=1)(stage_in)\n",
    "  pool = MaxPooling1D(pool_size=pooling_sizes[i])(conv)\n",
    "  flatten = Flatten()(pool)\n",
    "  convs.append(flatten)\n",
    "\n",
    "if len(kernel_sizes) > 1:\n",
    "    out = Concatenate()(convs)\n",
    "else:\n",
    "    out = convs[0]\n",
    "\n",
    "stages = Model(inputs=stage_in, outputs=out)\n",
    "\n",
    "model = Sequential([\n",
    "    stages,\n",
    "    Dense(hidden_dims, activation='relu'),\n",
    "    Dropout(dropout_prob),\n",
    "    Dense(num_classes, activation='softmax')\n",
    "])\n",
    "\n",
    "model.summary()\n",
    "\n",
    "# Note: also need pydot and GraphViz installed for this.\n",
    "#from tensorflow.keras.utils import plot_model                               \n",
    "#plot_model(model, show_shapes=True, expand_nested=True)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "66558141-f739-4822-883a-f352f2282327",
   "metadata": {
    "papermill": {
     "duration": null,
     "end_time": null,
     "exception": null,
     "start_time": null,
     "status": "pending"
    },
    "tags": []
   },
   "outputs": [],
   "source": [
    "batch_size = 64\n",
    "num_epochs = 20\n",
    "val_split = 0.1\n",
    "\n",
    "model.compile(loss='categorical_crossentropy', optimizer='adam',\n",
    "              metrics=['accuracy'])\n",
    "\n",
    "history = model.fit(x_shuffled, y_shuffled, batch_size=batch_size,\n",
    "                    epochs=num_epochs, validation_split=val_split,\n",
    "                    verbose=1)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "7160b805-3179-4987-85fa-6c5bdce9b46c",
   "metadata": {
    "papermill": {
     "duration": null,
     "end_time": null,
     "exception": null,
     "start_time": null,
     "status": "pending"
    },
    "tags": []
   },
   "outputs": [],
   "source": [
    "model_folder=str(time.time())\n",
    "model.save(data_dir + model_folder)\n",
    "shutil.make_archive(data_dir + output_model_zip.split('.zip')[0], 'zip', data_dir + model_folder)"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.8.6"
  },
  "papermill": {
   "default_parameters": {},
   "duration": 3.711158,
   "end_time": "2021-10-20T19:08:35.263542",
   "environment_variables": {},
   "exception": true,
   "input_path": "/home/jovyan/work/claimed/component-library/nlp/nlp-classify-text-simple.ipynb",
   "output_path": "/home/jovyan/work/claimed/component-library/nlp/nlp-classify-text-simple.ipynb",
   "parameters": {},
   "start_time": "2021-10-20T19:08:31.552384",
   "version": "2.3.3"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}
